## This file calculates different versions of the vm.community.obj measure
## using weights that depend on the proportion of the area in the hand drawn
## map (the "community") that overlaps with the smallest level of 2006 Canadian
## Census geography for which we have proportion visible minority

library(here)
library(tidyverse) # load dplyr, tidyr, ggplot2 packages
library(conflicted)
conflicts_prefer(dplyr::filter)
conflicts_prefer(dplyr::select)
library(sp)
library(sf) # for manipulating polygons and reading shape files
library(sfheaders)
library(units)
library(stringi)
library(rosm)
library(ggspatial)
## These next are for the raster grid data and are not necessary
# library(exactextractr)
# library(terra)
# library(stars)
library(gridExtra)
library(geojsonsf)
library(rmapshaper)
library(future)
library(future.apply)

## Some issues finding proj.db for projections
## this next is not perfectly portable --- for example the version number might be different on your machine than 9.4.1

system_info <- Sys.info()
if (system_info["sysname"] == "Darwin") {
  Sys.setenv(PROJ_LIB = "/opt/homebrew/Cellar/proj/9.4.1/share/proj")
}
if (system_info["nodename"] == "koelsch") {
  Sys.setenv(PROJ_LIB = "/home/jwbowers/miniforge3/envs/mambaR/share/proj")
}
if (grepl("keeling", system_info["nodename"])) {
  Sys.setenv(PROJ_LIB = "/data/keeling/a/jwbowers/mambaforge/envs/mambaR/share/proj")
}

## Sys.getenv("PROJ_LIB")
## Should be something like this:
## [1] "/opt/homebrew/Cellar/proj/9.4.1/share/proj"

source(here("Data", "map_functions.R"))

## The census data from 2006
load(here::here("Data/CensusData/2006_Data", "census_data.rda"), verbose = TRUE)

### The survey data
load(here("Data", "survey.geo.rda"))
## the column "community2" is the first community drawn and "s2_community2"
## is the second one (a couple of months later). They are both sp class polygons, in meters, but unclear coordinate reference system.
## so, fine for assessing relationships between them and calculating their areas, but not great for interacting with other polygons like those
## from the Canadian Census.
## there are also columns with draw.community.data (with the raw longitude and latitude points for the polygons)

## We are going to create sf class polygons from the raw drawing data (which are longitude and latitude points)
## There are a few cases with missing drawing data
table(is.na(survey.geo$draw.community.data))
table(survey.geo$draw.community.data == "")

## Make a smaller dataset with just the drawings and the old sp class polygons
surv_drawn <- survey.geo %>%
  filter(!is.na(draw.community.data)) %>%
  # filter(!is.na(community2) & !is.null(community2) & valid1 & valid2) %>%
  select(vcid, dauid, csduid, da.pop, csd.pop, community_area_km, draw.community.data) %>%
  mutate(draw.community.data = as.character(draw.community.data)) %>%
  droplevels()
# Describe the strings a bit
surv_drawn$string_lengths <- stri_length(surv_drawn$draw.community.data)
surv_drawn$num_polys <- stri_count_fixed(surv_drawn$draw.community.data, ";") + 1
table(surv_drawn$string_lengths)[1:20]
table(surv_drawn$num_polys, exclude = c())

## Just check on the ID
stopifnot(length(unique(surv_drawn$vcid)) == nrow(surv_drawn))

## Split all of the answers into the individual polygons
raw_polygons <- stri_split_fixed(surv_drawn$draw.community.data, ";", omit_empty = TRUE)
names(raw_polygons) <- surv_drawn$vcid

## When there are more than one polygon, we get a vector of strings.
## This next has 3 polygon candidates
raw_polygons[["3363408"]]

## Problem polygons
#  63846  328781  365367  547135  885198  946528 1073437 1093735 1435946
#  1693836 1870832 1963193 2369954 2430889 2469753 2826532

## Do some work to clean up potentially invalid polygons at the stage where we go from long/lat to polygons.
## blah_885198 <- obj_to_poly(raw_polygons[["885198"]])
## g_blah_885198 <- blah_885198 %>%
##   ggplot() +
##   annotation_map_tile(type = "cartolight", progress = "none") +
##   geom_sf(alpha = .25) +
##   theme_minimal()
## ggsave(g_blah, file = "g_blah.pdf")

# coords_885198 <- lapply(raw_polygons[["885198"]], from_strings_to_coords)
# blah_polys <- lapply(coords_885198, make_polys)
# sapply(blah_polys, st_is_valid)
# lapply(blah_polys, class)
# blah_comb <- st_union(do.call(c, blah_polys))
# blah_sf <- st_sf(geometry = blah_comb)
# blah_sf %>%
#   ggplot() +
#   annotation_map_tile(type = "cartolight", progress = "none") +
#   geom_sf(alpha = .25) +
#   theme_minimal()

blah_1093735 <- obj_to_poly(raw_polygons[["1093735"]])
# coords_1093735 <- lapply(raw_polygons[["1093735"]], from_strings_to_coords)
# len_coords_1093735 <- sapply(coords_1093735, function(obj) {
#  res <- nrow(obj)
#  if(is.null(res)){ return(NA)} else {return(res)}
# })
# which(is.na(len_coords_1093735))

## This next is mainly to throw errors if it runs into strange situations
problem_polygons <- c(
  "63846", "328781", "365367", "547135", "885198", "946528",
  "1073437", "1093735", "1435946", "1693836", "1870832", "1963193", "2369954",
  "2430889", "2469753", "2826532"
)
prob_polys <- vector(mode = "list", length = length(problem_polygons))
names(prob_polys) <- problem_polygons
for (i in names(prob_polys)) {
  message(i)
  prob_polys[[i]] <- obj_to_poly(raw_polygons[[i]])
}
missing_prob_polys <- sapply(prob_polys, is.na)
prob_polys_good <- prob_polys[!missing_prob_polys]
## Only one more with a problem
sapply(1:length(prob_polys_good), function(i) {
  message(i)
  return(st_area(prob_polys_good[[i]]))
})
## prob_polys_good[10] ## `1963193`
## And we can use make_valid on it

## Using a for loop to catch errors and strange polygons
surv_polys <- vector(mode = "list", length = length(raw_polygons))
names(surv_polys) <- names(raw_polygons)
for (i in names(surv_polys)) {
  message(i)
  surv_polys[[i]] <- obj_to_poly(raw_polygons[[i]])
}

stopifnot(all.equal(surv_drawn$vcid, as.numeric(names(surv_polys))))

na_polys <- is.na(surv_polys)
## Mostly of the maps are not valid after processing
sum(na_polys)
surv_polys_not_missing <- surv_polys[!na_polys]

## Check that all of the polygons that we have left are valid
valid_polys <- sapply(surv_polys_not_missing, st_is_valid)
stopifnot(all(valid_polys))

## Now convert that list of polygons and multipolygons into an SF dataset
### This next is somewhat annoying but is necessary. It makes each list element an sfc_MULTIPOLYGON
surv_polys_not_missing2 <- sapply(surv_polys_not_missing, `[`)
## Extract the pieces from the list:
## surv_polys_not_missing2 <- lapply(surv_polys_not_missing, function(x) { x })
## The census data is in CRS 4326 but we think that the
## Google Recorded Lon/Lat pairs are in CRS 3857
surv_col <- st_sfc(surv_polys_not_missing2) # , crs = 3857) # ,crs=4326)
names(surv_col) <- names(surv_polys_not_missing2)
surv_col["885198"]
surv_sf_dat <- st_sf(geometry = surv_col)
surv_sf_dat$vcid <- as.numeric(names(surv_polys_not_missing2))
st_geometry(surv_sf_dat)
surv_sf_dat %>% filter(vcid == 885198)
## surv_sf_dat <- surv_sf_dat %>% filter(!st_is_empty(.))
## surv_sf_dat <- st_make_valid(surv_sf_dat)
## I was getting errors using CRS 4326 for this
surv_sf_dat <- surv_sf_dat %>% mutate(
  area_km2_crsnull = as.numeric(st_area(.)) / 1000000
)

## stopifnot(all(st_is_valid(surv_sf_dat)))
## We want the 4326 CRS for use with the census data
st_crs(surv_sf_dat) <- st_crs(census_data)
surv_sf_dat <- st_make_valid(surv_sf_dat)
new_bad_polys <- surv_sf_dat %>% filter(!st_is_valid(.))
new_bad_polys$vcid

## 16 bad ones
surv_sf_dat$bad_poly <- surv_sf_dat$vcid %in% new_bad_polys$vcid
table(surv_sf_dat$bad_poly, exclude = c())

# stopifnot(all(st_is_valid(surv_sf_dat)))
surv_sf_dat %>% filter(vcid == 885198)

## It turns out that I can get areas calculated for everything if I don't use
## the s2 package. So turning it off just for now.
## sf_use_s2(FALSE)
## sf_use_s2(TRUE)

area_for_good_polys <- function(x) {
  x <- st_make_valid(x)
  if (!st_is_valid(x)) {
    return(NA)
  } else {
    return(as.numeric(set_units(st_area(x), km^2)))
  }
}

surv_sf_dat$area_km2_crs4326 <- sapply(surv_sf_dat$geometry, area_for_good_polys)

summary(surv_sf_dat$area_km2_crs4326)
# sf_use_s2(TRUE)

## Try to come up with some area numbers even for polygons that are super weird
## bad_g <- lapply(1:nrow(new_bad_polys), function(i) {
##   ggplot() +
##     geom_sf(data = new_bad_polys[i, ])
## })
##
## For example, turning off spherical geometry
## sf_use_s2(FALSE)
## new_bad_polys$area_km2_crs4326_tmp <- sapply(new_bad_polys$geometry, function(x) {
##  as.numeric(set_units(st_area(x), km^2))
## })
## sf_use_s2(TRUE)
##
## summary(new_bad_polys$area_km2_crs4326_tmp)
##
## surv_sf_dat <- left_join(surv_sf_dat, new_bad_polys %>% select(vcid, area_km2_crs4326_tmp) %>% as.data.frame())
### stopifnot(nrow(surv_sf_dat2) == nrow(surv_sf_dat))
##
## surv_sf_dat <- surv_sf_dat %>% mutate(area_km2_crs4326 = ifelse(is.na(area_km2_crs4326), abs(area_km2_crs4326_tmp), area_km2_crs4326))
summary(surv_sf_dat$area_km2_crs4326)

save(surv_sf_dat, file = here("Data", "surv_sf_dat.rda"))

####### Calculate the average VM of the Maps using weighted or unweighted
### averages of the Census DA data

head(census_data)

## Using a for loop to catch errors and strange polygons
## mostly running this not on my local laptop but on a larger machine
plan(multicore, workers = length(availableWorkers()) - 2)
da_vmpop_avg_lst <- future_lapply(1:nrow(surv_sf_dat), function(i) {
  # da_vmpop_avg_lst <- future_lapply(1:100, function(i) {
  theid <- surv_sf_dat$vcid[i]
  message("vcid : ", theid)
  calc_map_avgs_da_wts(theid)
}, future.scheduling = 1, future.seed = NULL)
save(da_vmpop_avg_lst, file = here("Data", "da_vmpop_avg_lst.rda"))

plan(sequential)

# da_vmpop_avg_lst <- vector(mode = "list", length = nrow(surv_sf_dat))
# names(da_vmpop_avg_lst) <- surv_sf_dat$vcid
# for (i in 1:10){
#   theid <- surv_sf_dat$vcid[i]
#   message(theid)
#   da_vmpop_avg_lst[[as.character(theid)]] <- calc_map_avgs_da_wts(theid)
# }
# #da_vmpop_avg_lst[as.character(surv_sf_dat$vcid[1:10])]
# glimpse(da_vmpop_avg_lst)
